Help! Why does the code always read the last column of my dataset and throw an error

by: Harris, 7 years ago

Last edited: 7 years ago


from collections import Counter
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn import svm, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

# calculate the percentage change in labels to normlaize the features

def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)

    #  Create new columns to hold the values of the percentage change of that day
    for i in range(1, hm_days+1):
        df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]

    df.fillna(0, inplace=True)
    return tickers, df

# process_data_for_labels('XOM')

def buy_sell_hold(*args):
    # Breaks down the series into an array of its values
    cols = [c for c in args]
    requirement = 0.025
    for col in cols:
        if col > requirement:
            return 1
        if col < -requirement:
            return -1
    return 0

def extract_featuresets(ticker):
    # returns ticker and new data frame with forward data joined
    tickers, df = process_data_for_labels(ticker)

    # Apply buy_sell_hold to each series that has been broken down to an array by the for loop
    # The multiple parameters of list will be taken as a sum and consolidated as a single list
    df['{}_target'.format(ticker)] = list(map(buy_sell_hold, df['{}_1d'.format(ticker)], df['{}_2d'.format(ticker)], df['{}_3d'.format(ticker)], df['{}_4d'.format(ticker)], df['{}_5d'.format(ticker)], df['{}_6d'.format(ticker)], df['{}_7d'.format(ticker)]))

    print df.columns

    vals = df['{}_target'.format(ticker)].values.tolist()
    str_vals = [str(i) for i in vals]
    print 'Data spread:', Counter(str_vals)

    df.fillna(0, inplace=True)

    # replace infinity values with nan
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    # Returns a dataframe of the percentage change from yesterday (price today - price yesterday)
    df_vals = df[[ticker for ticker in tickers]].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df['{}_target'.format(ticker)].values

    return X,y,df

def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    # clf = neighbors.KNeighborsClassifier()

    clf = VotingClassifier([('lsvc', svm.LinearSVC()), ('knn', neighbors.KNeighborsClassifier()), ('rfor', RandomForestCLassifier())])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print 'Accuracy', confidence
    predictions = clf.predict(X_test)
    # The spread is to see whether our predictions are skewed because the model thinks that a certain result is alot more accuarate
    print 'Predicted spread: ', Counter(predictions)

    return confidence

do_ml('AAPL')



My csv file is a table with multiple columns that end at the stock ticker name BBBY.

And this is the output and error I keep getting when I call do_ml('AAPL')


Traceback (most recent call last):
  File "finance_12.py", line 68, in <module>
    extract_featuresets('AAPL')
  File "finance_12.py", line 64, in extract_featuresets
    y = df['{}_target'.format(ticker)].values
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2059, in __getitem__
    return self._getitem_column(key)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/frame.py", line 2066, in _getitem_column
    return self._get_item_cache(key)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/generic.py", line 1386, in _get_item_cache
    values = self._data.get(item)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/core/internals.py", line 3543, in get
    loc = self.items.get_loc(item)
  File "/Users/harrisleow/anaconda/lib/python2.7/site-packages/pandas/indexes/base.py", line 2136, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/index.pyx", line 132, in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)
  File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)
  File "pandas/src/hashtable_class_helper.pxi", line 732, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)
  File "pandas/src/hashtable_class_helper.pxi", line 740, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)
KeyError: 'BBBY_target'




You must be logged in to post. Please login or register an account.



Ok I solved it. We shouldn't be naming the ticker variable as ticker in df_vals = df[[ticker for ticker in tickers]].pct_change() as it will replace the ticker parameter passed from the function as the last ticker name in your csv file

-Harris 7 years ago

You must be logged in to post. Please login or register an account.


can u please post the changes in code to execute thanks


-krishnagutta 7 years ago

You must be logged in to post. Please login or register an account.